Libraries required for this analysis

# Center all figures produced by knitr chunks.
knitr::opts_chunk$set(fig.align="center") 
library(rstanarm)   # Bayesian GLMs via Stan (stan_glm, normal priors)
library(tidyverse)  # dplyr/tidyr/ggplot2 data manipulation and plotting
library(tidybayes)  # tidy posterior draws, stat_halfeye, compare_levels, mean_qi
library(modelr) 
library(ggplot2)    # already attached by tidyverse; loaded again explicitly
library(magrittr)  
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)

# Light ggplot2 theme for every plot in this document.
theme_set(theme_light())

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then measured the user’s time to complete each of four tasks: 1. Find Extremum 2. Retrieve Value 3. Prediction 4. Exploration

Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict the time it takes the average user to complete each task. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s completion time for each of these four tasks.

Read in and clean data

# Load the completion-time data and derive the modeling variables in one
# pipeline: the three experimental factors become factors, and completion
# time is log-transformed so a Gaussian model cannot predict negative times.
# (The original used `=` assignment and a redundant read-then-overwrite step.)
time_data <- read.csv('processed_completion_time.csv') %>%
  mutate(
    across(c(dataset, oracle, search), as.factor),
    log_completion_time = log(completion_time)
  )

Set Priors (based on data from our pilot study)

Units on these are log seconds. We choose to work in log space in order to prevent our model from predicting completion times of less than zero seconds.

# Prior hyperparameters, in log-seconds, estimated from our pilot study.
# (Assignments switched from `=` to the conventional `<-`.)
prior_mean <- 5.69
prior_sd <- 0.69

# Per-task containers, filled in as each task section runs.
models <- list()

draw_data <- list()

search_differences <- list()
oracle_differences <- list()

# Seed shared by all stan_glm fits and posterior-draw calls for reproducibility.
seed <- 12

task_list <- c("1. Find Extremum",
               "2. Retrieve Value",
               "3. Prediction",
               "4. Exploration")

Find Extremum: Building a Model for Time Analysis

# Subset to the Find Extremum task and fit a Gaussian model of log
# completion time with all two- and three-way interactions between
# dataset, oracle, and search algorithm. Priors are on the log-second
# scale; autoscale = FALSE keeps them there instead of rescaling by
# predictor standard deviations.
data_find_extremum <- subset(time_data, task == "1. Find Extremum")
models$find_extremum <- stan_glm(
    log_completion_time ~ dataset * oracle * search,
    data = data_find_extremum,
    prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
    prior = normal(0, prior_sd, autoscale = FALSE),
    seed = seed
  )
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 9.5e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.95 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.125642 seconds (Warm-up)
## Chain 1:                0.113495 seconds (Sampling)
## Chain 1:                0.239137 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 1.1e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.11 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.097877 seconds (Warm-up)
## Chain 2:                0.08048 seconds (Sampling)
## Chain 2:                0.178357 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 1.7e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.17 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.087203 seconds (Warm-up)
## Chain 3:                0.078331 seconds (Sampling)
## Chain 3:                0.165534 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 9e-06 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.09 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.082043 seconds (Warm-up)
## Chain 4:                0.078863 seconds (Sampling)
## Chain 4:                0.160906 seconds (Total)
## Chain 4:

Find Extremum: Diagnostics + Model Evaluation

In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.

summary(models$find_extremum)
## 
## Model Info:
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      log_completion_time ~ dataset * oracle * search
##  algorithm:    sampling
##  sample:       4000 (posterior sample size)
##  priors:       see help('prior_summary')
##  observations: 59
##  predictors:   8
## 
## Estimates:
##                                        mean   sd   10%   50%   90%
## (Intercept)                           5.5    0.2  5.3   5.5   5.7 
## datasetmovies                        -0.2    0.2 -0.5  -0.2   0.1 
## oracledziban                          0.0    0.2 -0.3   0.0   0.3 
## searchdfs                             0.1    0.2 -0.2   0.1   0.4 
## datasetmovies:oracledziban            0.1    0.3 -0.3   0.1   0.5 
## datasetmovies:searchdfs              -0.2    0.3 -0.6  -0.2   0.2 
## oracledziban:searchdfs               -0.3    0.3 -0.7  -0.3   0.1 
## datasetmovies:oracledziban:searchdfs  0.0    0.4 -0.5   0.0   0.5 
## sigma                                 0.6    0.1  0.5   0.6   0.6 
## 
## Fit Diagnostics:
##            mean   sd   10%   50%   90%
## mean_PPD 5.3    0.1  5.2   5.3   5.5  
## 
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
## 
## MCMC diagnostics
##                                      mcse Rhat n_eff
## (Intercept)                          0.0  1.0  2520 
## datasetmovies                        0.0  1.0  2082 
## oracledziban                         0.0  1.0  2414 
## searchdfs                            0.0  1.0  1980 
## datasetmovies:oracledziban           0.0  1.0  2048 
## datasetmovies:searchdfs              0.0  1.0  1934 
## oracledziban:searchdfs               0.0  1.0  2233 
## datasetmovies:oracledziban:searchdfs 0.0  1.0  2194 
## sigma                                0.0  1.0  3559 
## mean_PPD                             0.0  1.0  4392 
## log-posterior                        0.1  1.0  1513 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

Trace plots help us check whether there is evidence of non-convergence for the model.

plot(models$find_extremum)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(models$find_extremum)

Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make it interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior draws of the expected (fitted) log completion time for each
# observed condition.
draw_data$find_extremum <- data_find_extremum %>%
  add_fitted_draws(models$find_extremum, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# NOTE(review): add_fitted_draws() is deprecated in newer tidybayes in favor
# of add_epred_draws() (which renames .value to .epred); kept as-is so the
# .value references downstream continue to work.
plot_find_extremum <- ggplot(draw_data$find_extremum, aes(
    x = exp(.value),  # back-transform from log-seconds to seconds
    y = condition,
    fill = dataset
  )) +
  # alpha is a constant, so it is a fixed aesthetic, not a data mapping;
  # placing it inside aes() created a spurious legend entry instead of
  # literally setting 50% opacity.
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Time (Seconds)", y = "Condition")

plot_find_extremum

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% and 50% interval bounds for the expected completion times.
fit_info_find_extremum <- draw_data$find_extremum %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_find_extremum
## # A tibble: 118 x 16
## # Groups:   participant_id, dataset, oracle, search, task, completion_time,
## #   condition, task_type, log_completion_time [59]
##    participant_id dataset oracle search task  completion_time condition
##             <int> <fct>   <fct>  <fct>  <chr>           <dbl> <chr>    
##  1              1 birdst… compa… bfs    1. F…           420.  compassq…
##  2              2 birdst… compa… bfs    1. F…           365.  compassq…
##  3              3 birdst… compa… bfs    1. F…           300.  compassq…
##  4              4 movies  compa… dfs    1. F…            64.1 compassq…
##  5              5 movies  dziban bfs    1. F…           229.  dziban_b…
##  6              6 birdst… compa… dfs    1. F…           189.  compassq…
##  7              7 movies  compa… dfs    1. F…            97.4 compassq…
##  8              8 movies  dziban dfs    1. F…           379.  dziban_d…
##  9              9 movies  dziban bfs    1. F…           399.  dziban_b…
## 10             10 birdst… compa… bfs    1. F…            80.9 compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## #   log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## #   .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image

Find Extremum: Differences Between Conditions

Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior draws of expected log completion time for each observed
# condition; used below for both the search and the oracle contrasts.
# (The original ended with group_by(search, oracle, dataset, .draw), which
# was dead code: both consumers immediately call group_by() again.)
find_extremum_predictive_data <- data_find_extremum %>%
    add_fitted_draws(models$find_extremum, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Average the expected times (in seconds) within each draw, then contrast
# the two search algorithms (dfs - bfs) draw by draw.
search_differences$find_extremum <- find_extremum_predictive_data  %>%
    group_by(search, dataset, .draw) %>%
    summarize(time = weighted.mean(exp(.value))) %>%
    compare_levels(time, by = search) %>%
    rename(diff_in_time = time)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
search_differences$find_extremum$metric <- "1. Find Extremum"

search_differences$find_extremum %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # $search[1] extracts the comparison label (e.g. "dfs - bfs") as a
      # plain value; [1, 'search'] on a tibble returned a 1x1 tibble.
      xlab(paste0("Expected Difference in Completion Time (", search_differences$find_extremum$search[1], ")")) +
      ylab("Task") +
      # alpha moved out of aes(): it is a constant, not a data mapping.
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

search_differences$find_extremum %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    dataset     diff_in_time .lower .upper .width .point .interval
##   <chr>     <fct>              <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - bfs birdstrikes         2.12  -94.2   96.1   0.95 mean   qi       
## 2 dfs - bfs movies            -38.9  -116.    33.1   0.95 mean   qi       
## 3 dfs - bfs birdstrikes         2.12  -28.7   33.3   0.5  mean   qi       
## 4 dfs - bfs movies            -38.9   -63.8  -13.5   0.5  mean   qi
# Same contrast as above, but between the two oracles (dziban - compassql).
oracle_differences$find_extremum <- find_extremum_predictive_data  %>%
    group_by(oracle, dataset, .draw) %>%
    summarize(time = weighted.mean(exp(.value))) %>%
    compare_levels(time, by = oracle) %>%
    rename(diff_in_time = time)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
oracle_differences$find_extremum$metric <- "1. Find Extremum"

oracle_differences$find_extremum %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # $oracle[1] extracts the comparison label as a plain value rather
      # than the 1x1 tibble produced by [1, 'oracle'].
      xlab(paste0("Expected Difference in Completion Time (", oracle_differences$find_extremum$oracle[1], ")")) +
      ylab("Task") +
      # alpha moved out of aes(): it is a constant, not a data mapping.
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

oracle_differences$find_extremum %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle           dataset    diff_in_time .lower .upper .width .point .interval
##   <chr>            <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - compas… birdstrik…       -40.8  -134.   52.5    0.95 mean   qi       
## 2 dziban - compas… movies            -1.27  -75.1  71.1    0.95 mean   qi       
## 3 dziban - compas… birdstrik…       -40.8   -70.9  -9.79   0.5  mean   qi       
## 4 dziban - compas… movies            -1.27  -25.7  23.1    0.5  mean   qi

Retrieve Value: Building a Model for Time Analysis

# Subset to the Retrieve Value task and fit the same Gaussian model of log
# completion time with all interactions between dataset, oracle, and
# search. Priors are on the log-second scale (autoscale = FALSE).
data_retrieve_value <- subset(time_data, task == "2. Retrieve Value")
models$retrieve_value <- stan_glm(
    log_completion_time ~ dataset * oracle * search,
    data = data_retrieve_value,
    prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
    prior = normal(0, prior_sd, autoscale = FALSE),
    seed = seed
  )
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 2e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.116888 seconds (Warm-up)
## Chain 1:                0.091973 seconds (Sampling)
## Chain 1:                0.208861 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 1e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.1 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.094167 seconds (Warm-up)
## Chain 2:                0.088024 seconds (Sampling)
## Chain 2:                0.182191 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 1.6e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.16 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.097329 seconds (Warm-up)
## Chain 3:                0.08179 seconds (Sampling)
## Chain 3:                0.179119 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 1.2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.099922 seconds (Warm-up)
## Chain 4:                0.091463 seconds (Sampling)
## Chain 4:                0.191385 seconds (Total)
## Chain 4:

Retrieve Value: Diagnostics + Model Evaluation

In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.

summary(models$retrieve_value)
## 
## Model Info:
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      log_completion_time ~ dataset * oracle * search
##  algorithm:    sampling
##  sample:       4000 (posterior sample size)
##  priors:       see help('prior_summary')
##  observations: 59
##  predictors:   8
## 
## Estimates:
##                                        mean   sd   10%   50%   90%
## (Intercept)                           5.3    0.2  5.1   5.3   5.5 
## datasetmovies                         0.0    0.2 -0.3   0.0   0.2 
## oracledziban                          0.2    0.2 -0.1   0.2   0.4 
## searchdfs                             0.0    0.2 -0.2   0.0   0.3 
## datasetmovies:oracledziban            0.0    0.3 -0.4   0.0   0.3 
## datasetmovies:searchdfs               0.1    0.3 -0.2   0.1   0.4 
## oracledziban:searchdfs               -0.1    0.3 -0.5  -0.1   0.3 
## datasetmovies:oracledziban:searchdfs -0.1    0.4 -0.6  -0.1   0.3 
## sigma                                 0.5    0.0  0.4   0.5   0.5 
## 
## Fit Diagnostics:
##            mean   sd   10%   50%   90%
## mean_PPD 5.4    0.1  5.3   5.4   5.5  
## 
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
## 
## MCMC diagnostics
##                                      mcse Rhat n_eff
## (Intercept)                          0.0  1.0  2258 
## datasetmovies                        0.0  1.0  2064 
## oracledziban                         0.0  1.0  1711 
## searchdfs                            0.0  1.0  2226 
## datasetmovies:oracledziban           0.0  1.0  1668 
## datasetmovies:searchdfs              0.0  1.0  2054 
## oracledziban:searchdfs               0.0  1.0  1802 
## datasetmovies:oracledziban:searchdfs 0.0  1.0  1905 
## sigma                                0.0  1.0  3660 
## mean_PPD                             0.0  1.0  3854 
## log-posterior                        0.1  1.0  1782 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

Trace plots help us check whether there is evidence of non-convergence for the model.

plot(models$retrieve_value)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(models$retrieve_value)

Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make it interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior draws of the expected (fitted) log completion time for each
# observed condition.
draw_data$retrieve_value <- data_retrieve_value %>%
  add_fitted_draws(models$retrieve_value, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# NOTE(review): add_fitted_draws() is deprecated in newer tidybayes in favor
# of add_epred_draws() (which renames .value to .epred); kept as-is so the
# .value references downstream continue to work.
plot_retrieve_value <- ggplot(draw_data$retrieve_value, aes(
    x = exp(.value),  # back-transform from log-seconds to seconds
    y = condition,
    fill = dataset
  )) +
  # alpha is a constant, so it is a fixed aesthetic, not a data mapping;
  # placing it inside aes() created a spurious legend entry instead of
  # literally setting 50% opacity.
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Time (Seconds)", y = "Condition")

plot_retrieve_value

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% and 50% interval bounds for the expected completion times.
fit_info_retrieve_value <- draw_data$retrieve_value %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_retrieve_value
## # A tibble: 118 x 16
## # Groups:   participant_id, dataset, oracle, search, task, completion_time,
## #   condition, task_type, log_completion_time [59]
##    participant_id dataset oracle search task  completion_time condition
##             <int> <fct>   <fct>  <fct>  <chr>           <dbl> <chr>    
##  1              1 birdst… compa… bfs    2. R…            131. compassq…
##  2              2 birdst… compa… bfs    2. R…            182. compassq…
##  3              3 birdst… compa… bfs    2. R…            158. compassq…
##  4              4 movies  compa… dfs    2. R…            274. compassq…
##  5              5 movies  dziban bfs    2. R…            242. dziban_b…
##  6              6 birdst… compa… dfs    2. R…            191. compassq…
##  7              7 movies  compa… dfs    2. R…            186. compassq…
##  8              8 movies  dziban dfs    2. R…            375. dziban_d…
##  9              9 movies  dziban bfs    2. R…            263. dziban_b…
## 10             10 birdst… compa… bfs    2. R…            366. compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## #   log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## #   .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image

Retrieve Value: Differences Between Conditions

Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior draws of expected log completion time for each observed
# condition; used below for both the search and the oracle contrasts.
# (The original ended with group_by(search, oracle, dataset, .draw), which
# was dead code: both consumers immediately call group_by() again.)
retrieve_value_predictive_data <- data_retrieve_value %>%
    add_fitted_draws(models$retrieve_value, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Average the expected times (in seconds) within each draw, then contrast
# the two search algorithms (dfs - bfs) draw by draw.
search_differences$retrieve_value <- retrieve_value_predictive_data  %>%
    group_by(search, dataset, .draw) %>%
    summarize(time = weighted.mean(exp(.value))) %>%
    compare_levels(time, by = search) %>%
    rename(diff_in_time = time)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
search_differences$retrieve_value$metric <- "2. Retrieve Value"

search_differences$retrieve_value %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # $search[1] extracts the comparison label (e.g. "dfs - bfs") as a
      # plain value; [1, 'search'] on a tibble returned a 1x1 tibble.
      xlab(paste0("Expected Difference in Completion Time (", search_differences$retrieve_value$search[1], ")")) +
      ylab("Task") +
      # alpha moved out of aes(): it is a constant, not a data mapping.
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

search_differences$retrieve_value %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    dataset     diff_in_time .lower .upper .width .point .interval
##   <chr>     <fct>              <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - bfs birdstrikes        -2.23  -76.1   73.4   0.95 mean   qi       
## 2 dfs - bfs movies              2.98  -66.4   70.5   0.95 mean   qi       
## 3 dfs - bfs birdstrikes        -2.23  -28.1   22.3   0.5  mean   qi       
## 4 dfs - bfs movies              2.98  -18.9   26.1   0.5  mean   qi
# Same contrast as above, but between the two oracles (dziban - compassql).
oracle_differences$retrieve_value <- retrieve_value_predictive_data  %>%
    group_by(oracle, dataset, .draw) %>%
    summarize(time = weighted.mean(exp(.value))) %>%
    compare_levels(time, by = oracle) %>%
    rename(diff_in_time = time)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
oracle_differences$retrieve_value$metric <- "2. Retrieve Value"

oracle_differences$retrieve_value %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # $oracle[1] extracts the comparison label as a plain value rather
      # than the 1x1 tibble produced by [1, 'oracle'].
      xlab(paste0("Expected Difference in Completion Time (", oracle_differences$retrieve_value$oracle[1], ")")) +
      ylab("Task") +
      # alpha moved out of aes(): it is a constant, not a data mapping.
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

oracle_differences$retrieve_value %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle          dataset    diff_in_time  .lower .upper .width .point .interval
##   <chr>           <fct>             <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - compa… birdstrik…        26.1  -48.3    102.    0.95 mean   qi       
## 2 dziban - compa… movies            -1.24 -71.5     67.6   0.95 mean   qi       
## 3 dziban - compa… birdstrik…        26.1    0.774   51.4   0.5  mean   qi       
## 4 dziban - compa… movies            -1.24 -23.8     23.0   0.5  mean   qi

Prediction: Building a Model for Time Analysis

# Subset to the Prediction task and fit the same Gaussian model of log
# completion time with all interactions between dataset, oracle, and
# search. Priors are on the log-second scale (autoscale = FALSE).
data_prediction <- subset(time_data, task == "3. Prediction")
models$prediction <- stan_glm(
    log_completion_time ~ dataset * oracle * search,
    data = data_prediction,
    prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
    prior = normal(0, prior_sd, autoscale = FALSE),
    seed = seed
  )
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 2e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.125573 seconds (Warm-up)
## Chain 1:                0.121491 seconds (Sampling)
## Chain 1:                0.247064 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 3.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.36 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.111821 seconds (Warm-up)
## Chain 2:                0.101047 seconds (Sampling)
## Chain 2:                0.212868 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 1.2e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.12 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.118462 seconds (Warm-up)
## Chain 3:                0.132986 seconds (Sampling)
## Chain 3:                0.251448 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 2e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.2 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.098254 seconds (Warm-up)
## Chain 4:                0.08518 seconds (Sampling)
## Chain 4:                0.183434 seconds (Total)
## Chain 4:

Prediction: Diagnostics + Model Evaluation

In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.

summary(models$prediction)
## 
## Model Info:
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      log_completion_time ~ dataset * oracle * search
##  algorithm:    sampling
##  sample:       4000 (posterior sample size)
##  priors:       see help('prior_summary')
##  observations: 59
##  predictors:   8
## 
## Estimates:
##                                        mean   sd   10%   50%   90%
## (Intercept)                           6.3    0.1  6.1   6.3   6.5 
## datasetmovies                         0.0    0.2 -0.2   0.0   0.3 
## oracledziban                          0.0    0.2 -0.2   0.0   0.2 
## searchdfs                             0.3    0.2  0.1   0.3   0.6 
## datasetmovies:oracledziban            0.2    0.2 -0.1   0.2   0.5 
## datasetmovies:searchdfs               0.1    0.2 -0.2   0.1   0.4 
## oracledziban:searchdfs               -0.3    0.3 -0.6  -0.3   0.1 
## datasetmovies:oracledziban:searchdfs -0.2    0.3 -0.6  -0.2   0.2 
## sigma                                 0.4    0.0  0.4   0.4   0.5 
## 
## Fit Diagnostics:
##            mean   sd   10%   50%   90%
## mean_PPD 6.4    0.1  6.4   6.5   6.5  
## 
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
## 
## MCMC diagnostics
##                                      mcse Rhat n_eff
## (Intercept)                          0.0  1.0  2330 
## datasetmovies                        0.0  1.0  2082 
## oracledziban                         0.0  1.0  2138 
## searchdfs                            0.0  1.0  1919 
## datasetmovies:oracledziban           0.0  1.0  2045 
## datasetmovies:searchdfs              0.0  1.0  1680 
## oracledziban:searchdfs               0.0  1.0  2100 
## datasetmovies:oracledziban:searchdfs 0.0  1.0  1834 
## sigma                                0.0  1.0  2899 
## mean_PPD                             0.0  1.0  4243 
## log-posterior                        0.1  1.0  1565 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

Trace plots help us check whether there is evidence of non-convergence for the model.

plot(models$prediction)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(models$prediction)

Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make them interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior draws of the expected completion time for each observed
# condition. NOTE(review): add_fitted_draws() is deprecated in newer
# tidybayes in favor of add_epred_draws() (which renames .value to
# .epred); kept as-is so the downstream .value references still work.
draw_data$prediction <- data_prediction %>%
  add_fitted_draws(models$prediction, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Posterior distribution of expected completion time (seconds) per
# condition, colored by dataset.
plot_prediction = ggplot(draw_data$prediction, aes(
    x = exp(.value),
    y = condition,
    fill = dataset
  )) +
  # alpha is a fixed value, not a data mapping, so it belongs in the
  # layer; putting it inside aes() creates a spurious "0.5" legend entry.
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Time (Seconds)", y = "Condition")

plot_prediction

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% and 50% interval bounds (in seconds) for the fitted means
# plotted above.
fit_info_prediction <- draw_data$prediction %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_prediction
## # A tibble: 118 x 16
## # Groups:   participant_id, dataset, oracle, search, task, completion_time,
## #   condition, task_type, log_completion_time [59]
##    participant_id dataset oracle search task  completion_time condition
##             <int> <fct>   <fct>  <fct>  <chr>           <dbl> <chr>    
##  1              1 birdst… compa… bfs    3. P…            529. compassq…
##  2              2 birdst… compa… bfs    3. P…            558. compassq…
##  3              3 birdst… compa… bfs    3. P…            555. compassq…
##  4              4 movies  compa… dfs    3. P…           1036. compassq…
##  5              5 movies  dziban bfs    3. P…            659. dziban_b…
##  6              6 birdst… compa… dfs    3. P…            592. compassq…
##  7              7 movies  compa… dfs    3. P…            537. compassq…
##  8              8 movies  dziban dfs    3. P…            498. dziban_d…
##  9              9 movies  dziban bfs    3. P…           1140. dziban_b…
## 10             10 birdst… compa… bfs    3. P…            712. compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## #   log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## #   .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image

Prediction: Differences Between Conditions

Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior draws of the expected completion time per observed condition.
# NOTE(review): this group_by() is overridden by the group_by() calls in
# the difference pipelines below, so it mainly documents intent.
prediction_predictive_data <- data_prediction %>%
    add_fitted_draws(models$prediction, seed = seed, re_formula = NA) %>%
    group_by(search, oracle, dataset, .draw)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Per-dataset posterior distribution of the (dfs - bfs) difference in
# expected completion time for the Prediction task.
search_differences$prediction <- prediction_predictive_data  %>%
    group_by(search, dataset, .draw) %>%
    # weighted.mean() without a weights argument is identical to mean();
    # write mean() so the intent is clear. .groups = "drop_last"
    # reproduces the previous default regrouping (search, dataset) while
    # silencing the summarise() message.
    summarize(time = mean(exp(.value)), .groups = "drop_last") %>%
    compare_levels(time, by = search) %>%
    rename(diff_in_time = time)
search_differences$prediction$metric = "3. Prediction"

# Half-eye plot of the search-algorithm difference; a dashed line at zero
# marks "no difference".
search_differences$prediction %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # [["search"]][1] extracts the comparison label as a plain string;
      # [1, 'search'] returns a one-cell tibble that paste0() coerces
      # unreliably.
      xlab(paste0("Expected Difference in Completion Time (",
                  search_differences$prediction[["search"]][1], ")")) +
      ylab("Task") +
      # alpha is a constant, so set it in the layer, not in aes().
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

search_differences$prediction %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    dataset     diff_in_time .lower .upper .width .point .interval
##   <chr>     <fct>              <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - bfs birdstrikes         135.  -41.2   314.   0.95 mean   qi       
## 2 dfs - bfs movies              107.  -91.0   305.   0.95 mean   qi       
## 3 dfs - bfs birdstrikes         135.   74.5   195.   0.5  mean   qi       
## 4 dfs - bfs movies              107.   43.8   175.   0.5  mean   qi
# Per-dataset posterior distribution of the (dziban - compassql)
# difference in expected completion time for the Prediction task.
oracle_differences$prediction <- prediction_predictive_data  %>%
    group_by(oracle, dataset, .draw) %>%
    # weighted.mean() without weights is identical to mean();
    # .groups = "drop_last" matches the previous default regrouping while
    # silencing the summarise() message.
    summarize(time = mean(exp(.value)), .groups = "drop_last") %>%
    compare_levels(time, by = oracle) %>%
    rename(diff_in_time = time)
oracle_differences$prediction$metric = "3. Prediction"

# Half-eye plot of the oracle difference; a dashed line at zero marks
# "no difference".
oracle_differences$prediction %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # [["oracle"]][1] extracts the comparison label as a plain string;
      # [1, 'oracle'] returns a one-cell tibble that paste0() coerces
      # unreliably.
      xlab(paste0("Expected Difference in Completion Time (",
                  oracle_differences$prediction[["oracle"]][1], ")")) +
      ylab("Task") +
      # alpha is a constant, so set it in the layer, not in aes().
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

oracle_differences$prediction %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle           dataset    diff_in_time .lower .upper .width .point .interval
##   <chr>            <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - compas… birdstrik…        -84.9  -273.   93.1   0.95 mean   qi       
## 2 dziban - compas… movies            -52.7  -250.  143.    0.95 mean   qi       
## 3 dziban - compas… birdstrik…        -84.9  -147.  -23.2   0.5  mean   qi       
## 4 dziban - compas… movies            -52.7  -118.   12.1   0.5  mean   qi

Exploration: Building a Model for Time Analysis

# Fit a Bayesian linear model of log completion time for the Exploration
# task, with all main effects and interactions of dataset, oracle, and
# search. Priors are on the log-seconds scale and come from the pilot
# study; the fixed seed makes the sampling reproducible.
data_exploration <- subset(time_data, task == "4. Exploration")
models$exploration <- stan_glm(
    log_completion_time ~ dataset * oracle * search,
    data = data_exploration,
    prior_intercept = normal(prior_mean, prior_sd, autoscale = FALSE),
    prior = normal(0, prior_sd, autoscale = FALSE),
    seed = seed
  )
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 2.2e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.22 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.141074 seconds (Warm-up)
## Chain 1:                0.112687 seconds (Sampling)
## Chain 1:                0.253761 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 1.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.16 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.113867 seconds (Warm-up)
## Chain 2:                0.110881 seconds (Sampling)
## Chain 2:                0.224748 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 1.9e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.19 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.118154 seconds (Warm-up)
## Chain 3:                0.133708 seconds (Sampling)
## Chain 3:                0.251862 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 1.3e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.13 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.110991 seconds (Warm-up)
## Chain 4:                0.113861 seconds (Sampling)
## Chain 4:                0.224852 seconds (Total)
## Chain 4:

Exploration: Diagnostics + Model Evaluation

In the summary table, we want to see Rhat values close to 1.0 and effective sample sizes (n_eff) in the thousands.

summary(models$exploration)
## 
## Model Info:
##  function:     stan_glm
##  family:       gaussian [identity]
##  formula:      log_completion_time ~ dataset * oracle * search
##  algorithm:    sampling
##  sample:       4000 (posterior sample size)
##  priors:       see help('prior_summary')
##  observations: 59
##  predictors:   8
## 
## Estimates:
##                                        mean   sd   10%   50%   90%
## (Intercept)                           6.6    0.1  6.4   6.6   6.7 
## datasetmovies                         0.0    0.1 -0.1   0.0   0.2 
## oracledziban                         -0.1    0.1 -0.3  -0.1   0.1 
## searchdfs                             0.0    0.1 -0.1   0.0   0.2 
## datasetmovies:oracledziban            0.1    0.2 -0.2   0.1   0.3 
## datasetmovies:searchdfs              -0.2    0.2 -0.4  -0.2   0.0 
## oracledziban:searchdfs                0.1    0.2 -0.1   0.1   0.3 
## datasetmovies:oracledziban:searchdfs  0.0    0.3 -0.3   0.0   0.4 
## sigma                                 0.3    0.0  0.2   0.3   0.3 
## 
## Fit Diagnostics:
##            mean   sd   10%   50%   90%
## mean_PPD 6.6    0.1  6.5   6.6   6.6  
## 
## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
## 
## MCMC diagnostics
##                                      mcse Rhat n_eff
## (Intercept)                          0.0  1.0  2294 
## datasetmovies                        0.0  1.0  1991 
## oracledziban                         0.0  1.0  1625 
## searchdfs                            0.0  1.0  1990 
## datasetmovies:oracledziban           0.0  1.0  1496 
## datasetmovies:searchdfs              0.0  1.0  1784 
## oracledziban:searchdfs               0.0  1.0  1572 
## datasetmovies:oracledziban:searchdfs 0.0  1.0  1602 
## sigma                                0.0  1.0  3187 
## mean_PPD                             0.0  1.0  3676 
## log-posterior                        0.1  1.0  1741 
## 
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).

Trace plots help us check whether there is evidence of non-convergence for the model.

plot(models$exploration)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

pairs(models$exploration)

Using draws from the posterior, we can visualize parameter effects and average response. Be sure to apply an exponential transform to our log-transformed times to make them interpretable! The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.

# Posterior draws of the expected completion time for each observed
# condition. NOTE(review): add_fitted_draws() is deprecated in newer
# tidybayes in favor of add_epred_draws() (which renames .value to
# .epred); kept as-is so the downstream .value references still work.
draw_data$exploration <- data_exploration %>%
  add_fitted_draws(models$exploration, seed = seed, re_formula = NA)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Posterior distribution of expected completion time (seconds) per
# condition, colored by dataset.
plot_exploration = ggplot(draw_data$exploration, aes(
    x = exp(.value),
    y = condition,
    fill = dataset
  )) +
  # alpha is a fixed value, not a data mapping, so it belongs in the
  # layer; putting it inside aes() creates a spurious "0.5" legend entry.
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Time (Seconds)", y = "Condition")

plot_exploration

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% and 50% interval bounds (in seconds) for the fitted means
# plotted above.
fit_info_exploration <- draw_data$exploration %>% mean_qi(exp(.value), .width = c(.95, .5))
fit_info_exploration
## # A tibble: 118 x 16
## # Groups:   participant_id, dataset, oracle, search, task, completion_time,
## #   condition, task_type, log_completion_time [59]
##    participant_id dataset oracle search task  completion_time condition
##             <int> <fct>   <fct>  <fct>  <chr>           <dbl> <chr>    
##  1              1 birdst… compa… bfs    4. E…            765. compassq…
##  2              2 birdst… compa… bfs    4. E…            682. compassq…
##  3              3 birdst… compa… bfs    4. E…            955. compassq…
##  4              4 movies  compa… dfs    4. E…            950. compassq…
##  5              5 movies  dziban bfs    4. E…            596. dziban_b…
##  6              6 birdst… compa… dfs    4. E…            597. compassq…
##  7              7 movies  compa… dfs    4. E…            670. compassq…
##  8              8 movies  dziban dfs    4. E…            880. dziban_d…
##  9              9 movies  dziban bfs    4. E…            830. dziban_b…
## 10             10 birdst… compa… bfs    4. E…            914. compassq…
## # … with 108 more rows, and 9 more variables: task_type <chr>,
## #   log_completion_time <dbl>, .row <int>, `exp(.value)` <dbl>, .lower <dbl>,
## #   .upper <dbl>, .width <dbl>, .point <chr>, .interval <chr>
## Saving 7 x 5 in image

Exploration: Differences Between Conditions

Next, we want to see if there is any significant difference in completion time between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).

# Posterior draws of the expected completion time per observed condition.
# NOTE(review): this group_by() is overridden by the group_by() calls in
# the difference pipelines below, so it mainly documents intent.
exploration_predictive_data <- data_exploration %>%
    add_fitted_draws(models$exploration, seed = seed, re_formula = NA) %>%
    group_by(search, oracle, dataset, .draw)
## Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.
# Per-dataset posterior distribution of the (dfs - bfs) difference in
# expected completion time for the Exploration task.
search_differences$exploration <- exploration_predictive_data  %>%
    group_by(search, dataset, .draw) %>%
    # weighted.mean() without weights is identical to mean();
    # .groups = "drop_last" matches the previous default regrouping while
    # silencing the summarise() message.
    summarize(time = mean(exp(.value)), .groups = "drop_last") %>%
    compare_levels(time, by = search) %>%
    rename(diff_in_time = time)
search_differences$exploration$metric = "4. Exploration"

# Half-eye plot of the search-algorithm difference; a dashed line at zero
# marks "no difference".
search_differences$exploration %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # [["search"]][1] extracts the comparison label as a plain string;
      # [1, 'search'] returns a one-cell tibble that paste0() coerces
      # unreliably.
      xlab(paste0("Expected Difference in Completion Time (",
                  search_differences$exploration[["search"]][1], ")")) +
      ylab("Task") +
      # alpha is a constant, so set it in the layer, not in aes().
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

search_differences$exploration %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    dataset     diff_in_time  .lower .upper .width .point .interval
##   <chr>     <fct>              <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dfs - bfs birdstrikes         61.9  -87.5   214.    0.95 mean   qi       
## 2 dfs - bfs movies             -70.1 -210.     76.9   0.95 mean   qi       
## 3 dfs - bfs birdstrikes         61.9    9.89  112.    0.5  mean   qi       
## 4 dfs - bfs movies             -70.1 -117.    -23.9   0.5  mean   qi
# Per-dataset posterior distribution of the (dziban - compassql)
# difference in expected completion time for the Exploration task.
oracle_differences$exploration <- exploration_predictive_data  %>%
    group_by(oracle, dataset, .draw) %>%
    # weighted.mean() without weights is identical to mean();
    # .groups = "drop_last" matches the previous default regrouping while
    # silencing the summarise() message.
    summarize(time = mean(exp(.value)), .groups = "drop_last") %>%
    compare_levels(time, by = oracle) %>%
    rename(diff_in_time = time)
oracle_differences$exploration$metric = "4. Exploration"

# Half-eye plot of the oracle difference; a dashed line at zero marks
# "no difference".
oracle_differences$exploration %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # [["oracle"]][1] extracts the comparison label as a plain string;
      # [1, 'oracle'] returns a one-cell tibble that paste0() coerces
      # unreliably.
      xlab(paste0("Expected Difference in Completion Time (",
                  oracle_differences$exploration[["oracle"]][1], ")")) +
      ylab("Task") +
      # alpha is a constant, so set it in the layer, not in aes().
      stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.

oracle_differences$exploration %>% mean_qi(diff_in_time, .width = c(.95, .5))
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle          dataset    diff_in_time  .lower .upper .width .point .interval
##   <chr>           <fct>             <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 dziban - compa… birdstrik…        -31.0 -186.    118.    0.95 mean   qi       
## 2 dziban - compa… movies             38.7 -100.    177.    0.95 mean   qi       
## 3 dziban - compa… birdstrik…        -31.0  -79.7    20.0   0.5  mean   qi       
## 4 dziban - compa… movies             38.7   -8.00   86.5   0.5  mean   qi

Summary Plots

Plot all of the posterior draws on one plot

# Combine the posterior draws from all four task models into one frame;
# bind_rows() is the dplyr-native equivalent of rbind() for tibbles.
all_draws <- bind_rows(draw_data$find_extremum, draw_data$retrieve_value,
                       draw_data$prediction, draw_data$exploration)
# NOTE(review): assigning to `plot` shadows base::plot for data lookups;
# a more specific name (e.g. all_draws_plot) would be safer. Name kept
# for compatibility with any later references.
plot = ggplot(all_draws, aes(
    x = exp(.value),
    y = task,
    fill = search
  )) +
  # alpha is a constant, so set it in the layer, not in aes().
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Time (Seconds)", y = "Task") +
  facet_grid(. ~ dataset)
## Saving 7 x 5 in image

Putting all of the plots for search algorithm and oracle differences on the same plot:

# Stack the per-task search-difference draws; bind_rows() is the
# dplyr-native equivalent of rbind() for tibbles.
combined_search_differences <- bind_rows(
  search_differences$find_extremum,
  search_differences$retrieve_value,
  search_differences$prediction,
  search_differences$exploration
)

# Order the task factor so plots list tasks 1-4 from top to bottom.
combined_search_differences$metric <- factor(combined_search_differences$metric, levels = rev(task_list))

# Search-algorithm differences across all four tasks, pooled over
# datasets.
search_differences_plot <- combined_search_differences %>%
      ggplot(aes(x = diff_in_time, y = metric)) +
      # [["search"]][1] extracts the comparison label as a plain string;
      # [1, 'search'] returns a one-cell tibble that paste0() coerces
      # unreliably.
      xlab(paste0("Time Difference (Seconds): ",
                  combined_search_differences[["search"]][1])) +
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

search_differences_plot

# Same comparison, faceted by dataset.
search_differences_plot_split_by_dataset <- combined_search_differences %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # [["search"]][1] extracts the comparison label as a plain string.
      xlab(paste0("Time Difference (Seconds): ",
                  combined_search_differences[["search"]][1])) +
      ylab("Task") +
      facet_grid(. ~ dataset) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

search_differences_plot_split_by_dataset

Now for oracle differences

# Stack the per-task oracle-difference draws; bind_rows() is the
# dplyr-native equivalent of rbind() for tibbles.
combined_oracle_differences <- bind_rows(
  oracle_differences$find_extremum,
  oracle_differences$retrieve_value,
  oracle_differences$prediction,
  oracle_differences$exploration
)

# Order the task factor so plots list tasks 1-4 from top to bottom.
combined_oracle_differences$metric <- factor(combined_oracle_differences$metric, levels = rev(task_list))

# Oracle differences across all four tasks, pooled over datasets.
oracle_differences_plot <- combined_oracle_differences %>%
      ggplot(aes(x = diff_in_time, y = metric)) +
      # [["oracle"]][1] extracts the comparison label as a plain string;
      # [1, 'oracle'] returns a one-cell tibble that paste0() coerces
      # unreliably.
      xlab(paste0("Time Difference (Seconds): ",
                  combined_oracle_differences[["oracle"]][1])) +
      ylab("Task") +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

oracle_differences_plot

# Same comparison, faceted by dataset.
oracle_differences_plot_split_by_dataset <- combined_oracle_differences %>%
      ggplot(aes(x = diff_in_time, y = metric, fill = dataset)) +
      # [["oracle"]][1] extracts the comparison label as a plain string.
      xlab(paste0("Time Difference (Seconds): ",
                  combined_oracle_differences[["oracle"]][1])) +
      ylab("Task") +
      facet_grid(. ~ dataset) +
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

oracle_differences_plot_split_by_dataset